home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Tech Arsenal 1
/
Tech Arsenal (Arsenal Computer).ISO
/
tek-13
/
trs141f.zip
/
NFMT.ZIP
/
NFMT.C
< prev
next >
Wrap
C/C++ Source or Header
|
1992-07-20
|
16KB
|
681 lines
#include <stdio.h>
#include <ctype.h>
#include "config.h"
#include "options.h"
/*
* NAME
* nfmt - simple text formatter
*
* SYNOPSIS
* usage: nfmt [ -cs ] [ -width ] [ -p prefix ] [ file ... ]
*
* DESCRIPTION
* Nfmt is a simple text formatter, similar to the BSD program fmt(1),
* but uses best-fit line breaking, by a simple version of
*
* "Breaking Paragraphs into Lines",
* Donald E. Knuth and Michael F. Plass,
* "Software--Practice and Experience" 11 (1981) 1119-1184.
*
* Tabs are expanded on input and re-introduced on output.
*
* OPTIONS
* -c Crown margin mode. The indentation of the first line
* of a paragraph must be different from the indentation
* of the second. Subsequent lines must have the same
* indentation as the second line.
*
* -s Split lines only.
*
* -width Maximum line width (default 75). Fmt prefers to make
* lines about 7% shorter, to give it room to balance line
* lengths.
*
* -p prefix
* Only lines beginning with the prefix (possibly preceded
* by white space) are re-arranged; the prefix (with any
* preceding white space) is stripped for the formatting and
* re-attached to each formatted output line. For example,
*
* nfmt -p ' * '
*
* formats C comments laid out in the normal way, leaving
* the code unchanged. The -p option may also be combined
* with the other options.
*
* AUTHOR
* Ross Paterson <rap@doc.ic.ac.uk>
* Packaging and portability by Yossi Gil <yogi@cs.ubc.ca>
*/
/*
* The following parameters represent the program's idea of what is "best".
* Adjust to taste, subject to the caveats given.
*/
#define WIDTH 75 /* longest permitted line length */
#define LEEWAY 15 /* 1/LEEWAY of line is best left unused */
/*
* Costs and bonuses are expressed as the equivalent departure from
* the optimal line length, multiplied by 10.
* e.g. assigning something a cost of 50 means that it is as bad as
* a line 5 characters too short or long.
* The definition of SHORT_COST(n) should not be changed.
* However, EQUIV(n) may need tuning.
*/
typedef long COST;
#define MAXCOST (~(((COST)1)<<(8*sizeof(COST)-1)))
#define SQR(n) ((n)*(n))
#define EQUIV(n) SQR((COST)(n))
/* cost of a filled line longer or shorter than best_width */
#define SHORT_COST(n) EQUIV((n)*10)
/* cost of the difference between adjacent filled lines */
#define RAGGED_COST(n) (SHORT_COST(n)/2)
/* basic cost per line */
#define LINE_COST EQUIV(70)
/* line break after the first word of a sentence */
#define WIDOW_COST EQUIV(200)
/* line break before the last word of a sentence */
#define ORPHAN_COST EQUIV(150)
/* line break at the end of a sentence */
#define SENTENCE_BONUS EQUIV(50)
/* line break after close parenthesis */
#define PAREN_BONUS EQUIV(40)
/* line break after other punctuation */
#define PUNCT_BONUS EQUIV(40)
/* credit for breaking a long paragraph 1 line later */
#define LINE_CREDIT EQUIV(30)
/*
* Size of paragraph buffer, in words and characters.
* Longer paragraphs are handled (cf. flush_paragraph()).
*/
#define MAXWORDS 1000
#define MAXCHARS 5000
/*
* Extra ctype(3)-style macros
*/
#define isopen(c) (strchr("([`'\"", c) != NULL)
#define isclose(c) (strchr(")]'\"", c) != NULL)
#define isperiod(c) (strchr(".?!", c) != NULL)
/*
* Miscellaneous definitions
*/
#define reg register
typedef int bool;
#define TRUE 1
#define FALSE 0
#define until(c) if (c) break
/*
* Word descriptor structure
*/
#define WORD struct _WORD
WORD {
/* static attributes determined during input */
char *text;
short length; /* length of this word */
short space; /* the size of the following space */
bool paren:1; /* starts with open paren */
bool period:1; /* ends in [.?!])* */
bool punct:1; /* ends in punctuation */
/* the remaining fields are computed during the optimization */
short line_length; /* length of the best line starting here */
COST best_cost;
WORD *next_break; /* break which achieves best_cost */
};
/*
* Forward declarations
*/
extern void trim_prefix ARGS((void));
extern void fmt ARGS((void));
extern bool get_paragraph ARGS((void));
extern int get_line ARGS((int c));
extern int get_prefix ARGS((void));
extern int get_space ARGS((int c));
extern int copy_rest ARGS((int c));
extern bool same_para ARGS((int c));
extern void flush_paragraph ARGS((void));
extern void fmt_paragraph ARGS((void));
extern void check_punctuation ARGS((WORD *w, char *finish));
extern COST base_cost ARGS((WORD *this));
extern COST line_cost ARGS((WORD *next, int len));
extern void put_paragraph ARGS((WORD *finish));
extern void put_line ARGS((WORD *w));
extern void put_prefix ARGS((int c));
extern void put_space ARGS((int space));
FILE *infile;
/*
* Option values and derived quantities
*/
bool crown = FALSE;
bool split = FALSE; /* each line is a paragraph on its own */
char default_prefix[] = "";
char *prefix = default_prefix;
int prefix_length;
int prefix_lead_space;
int prefix_full_length;
int max_width = WIDTH;
int best_width;
main(argc, argv)
reg int argc;
reg char *argv[];
{
reg int i;
OPTIONS ("-cs -width -p prefix [ file ... ]")
FLAG ('c', crown)
FLAG ('s', split)
NUM_OPT (max_width)
STRING ('p', prefix)
ENDOPTS
best_width = max_width*(LEEWAY-1)/LEEWAY;
trim_prefix();
if (argc == 1) {
infile = stdin;
fmt();
}
else
for (i = 1; i < argc; i++)
if ((infile = fopen(argv[i], "r")) == NULL)
fprintf(stderr, "%s: can't read file '%s'\n",
argv[0], argv[i]);
else {
fmt();
fclose(infile);
}
return 0;
}
void
trim_prefix()
{
reg char *s;
prefix_lead_space = 0;
while (*prefix == ' ') {
prefix_lead_space++;
prefix++;
}
prefix_full_length = strlen(prefix);
s = prefix + prefix_full_length;
while (s > prefix && s[-1] == ' ')
s--;
*s = '\0';
prefix_length = s - prefix;
}
int in_column = 0;
int out_column = 0;
char parabuf[MAXCHARS]; /* space for the paragraph text */
WORD word[MAXWORDS];
WORD *word_limit;
char *wptr;
int first_indent; /* indentation of first line */
int indent; /* indentation of rest of current paragraph */
int prefix_indent;
int next_prefix_indent;
bool tabs = FALSE; /* tabs in input? */
int next_char; /* one-character look-ahead */
char *to_match = "";
void
fmt()
{
next_char = get_prefix();
while (get_paragraph()) {
fmt_paragraph();
put_paragraph(word_limit);
}
}
/*
* Definitions.
*
* A <paragraph> is a maximal non-empty set of consecutive non-blank
* lines at the same indent.
* In split mode, a paragraph is a single non-blank line.
* In crown mode, the second and subsequent lines must have the same
* indentation, but differing from the first line.
* If a prefix is in effect, it must also be at the same indent for
* each line in the paragraph.
*
* A <word> is a maximal non-empty set of non-white characters.
*
* A <sentence break> is either the end of a paragraph or a word ending
* in [.?!], possibly followed by ) or ], followed by a word beginning
* with a capital.
*/
bool
get_paragraph()
{
reg int c;
c = next_char;
/*
* Scan (and copy) blank lines,
* and lines not introduced by the prefix.
*/
while (c == '\n' || *to_match ||
next_prefix_indent < prefix_lead_space ||
in_column < next_prefix_indent + prefix_full_length) {
if (prefix_length != 0)
c = copy_rest(c);
until(c == EOF);
putchar('\n');
c = get_prefix();
}
if (c == EOF) {
next_char = EOF;
return FALSE;
}
prefix_indent = next_prefix_indent;
wptr = parabuf;
word_limit = word;
if (split) {
first_indent = indent = in_column;
c = get_line(c);
}
else if (crown) {
first_indent = in_column;
c = get_line(c);
if (same_para(c) && in_column != first_indent) {
indent = in_column;
do { /* for each line till the end of the para */
c = get_line(c);
} while (same_para(c) && in_column == indent);
}
/*
* If only one line, use the secondary indent
* from last time (initially 0) if it splits.
*/
}
else {
first_indent = indent = in_column;
do { /* for each line till the end of the para */
c = get_line(c);
} while (same_para(c) && in_column == indent);
}
(word_limit-1)->period = TRUE;
next_char = c;
return TRUE;
}
/*
* Copy a line which failed to match the prefix to the output,
* or which was blank after the prefix.
*
* In the former case, c is the character that failed to match *to_match.
* In the latter, c is \n or EOF.
* Returns the character ending the line.
*/
int
copy_rest(c)
reg int c;
{
reg char *s;
out_column = 0;
put_space(next_prefix_indent);
for (s = prefix; s != to_match; s++)
putchar(*s);
if (c != '\n' && c != EOF) {
out_column += to_match - prefix;
put_space(in_column - out_column);
do {
putchar(c);
c = getc(infile);
} while (c != '\n' && c != EOF);
}
return c;
}
bool
same_para(c)
reg int c;
{
return next_prefix_indent == prefix_indent && *to_match == '\0' &&
c != '\n' && c != EOF;
}
/*
* Read a line, given first non-blank character c, and the following
* indent, returning the first non-blank character of the next line.
*/
int
get_line(c)
reg int c;
{
int start;
reg char *end_of_parabuf;
reg WORD *end_of_word;
end_of_parabuf = ¶buf[MAXCHARS];
end_of_word = &word[MAXWORDS-2];
do { /* for each word in a line */
/* scan word */
if (islower(c) && word_limit != word)
(word_limit-1)->period = FALSE;
word_limit->text = wptr;
do {
if (wptr == end_of_parabuf)
flush_paragraph();
*wptr++ = c;
c = getc(infile);
} while (c != EOF && ! isspace(c));
check_punctuation(word_limit, wptr-1);
in_column += word_limit->length = wptr - word_limit->text;
if (wptr == end_of_parabuf || word_limit == end_of_word)
flush_paragraph();
*wptr++ = '\0';
/* scan inter-word space */
start = in_column;
c = get_space(c);
word_limit->space = in_column - start;
word_limit++;
if (c == EOF)
return EOF;
} while (c != '\n');
c = get_prefix();
(word_limit-1)->space =
c != EOF && ! islower(c) && (word_limit-1)->period ? 2 : 1;
return c;
}
int
get_prefix()
{
reg int c;
in_column = 0;
c = get_space(getc(infile));
to_match = prefix;
if (prefix_length == 0)
next_prefix_indent = 0;
else {
next_prefix_indent = in_column;
while (*to_match && c == *to_match) {
in_column++;
to_match++;
c = getc(infile);
}
if (*to_match == '\0')
c = get_space(c);
}
return c;
}
/*
* Scan blank characters, keeping in_column up-to-date.
*/
int
get_space(c)
reg int c;
{
for (;;) {
if (c == ' ')
in_column++;
else if (c == '\t') {
tabs = TRUE;
in_column = (in_column/8 + 1)*8;
}
else
return c;
c = getc(infile);
}
}
void
check_punctuation(w, finish)
reg WORD *w;
reg char *finish;
{
reg char *start;
start = w->text;
w->paren = isopen(*start);
w->punct = ispunct(*finish);
while (isclose(*finish) && finish > start)
finish--;
w->period = isperiod(*finish);
}
/*
* On hitting the limit, flush part of the paragraph to make room.
*/
void
flush_paragraph()
{
WORD saved_last_word;
WORD *split_point;
reg WORD *w;
int shift;
COST best_break;
/*
* In the special case where it's all one word, we just flush it.
*/
if (word_limit == word) {
printf("%*s", wptr-parabuf, parabuf);
wptr = parabuf;
return;
}
/*
* Otherwise:
* - format what you have so far as a paragraph,
* - find a low-cost line break near the end,
* - output to there,
* - make that the start of the paragraph.
*/
saved_last_word = *word_limit;
fmt_paragraph();
/* choose a good split point */
split_point = word;
best_break = MAXCOST;
for (w = word->next_break; w != word_limit; w = w->next_break) {
if (w->best_cost - w->next_break->best_cost < best_break) {
split_point = w;
best_break = w->best_cost - w->next_break->best_cost;
}
if (best_break <= MAXCOST - LINE_CREDIT)
best_break += LINE_CREDIT;
}
if (split_point == word) {
fprintf(stderr, "%s: panic\n", O_name);
exit(1);
}
put_paragraph(split_point);
*word_limit = saved_last_word;
/* copy text of words down to start of parabuf */
memcpy(parabuf, split_point->text, wptr - split_point->text);
shift = split_point->text - parabuf;
wptr -= shift;
/* adjust text pointers */
for (w = split_point; w <= word_limit; w++)
w->text -= shift;
/* copy words from split_point down to word */
memcpy((char *)word, (char *)split_point,
(word_limit - split_point + 1)*sizeof(WORD));
word_limit -= split_point - word;
}
/*
* Compute the optimal formatting for the whole paragraph by computing and
* remembering the optimal formatting for each suffix from the empty one
* to the whole paragraph.
*/
void
fmt_paragraph()
{
reg WORD *start, *w;
reg int len;
reg COST wcost, best;
word_limit->best_cost = 0;
word_limit->length = max_width; /* sentinel */
for (start = word_limit-1; start >= word; start--) {
best = MAXCOST;
len = start == word ? first_indent : indent;
/* at least one word, however long, in the line */
w = start;
len += w->length;
do {
w++;
/* consider breaking before w */
wcost = line_cost(w, len) + w->best_cost;
if (wcost < best) {
best = wcost;
start->next_break = w;
start->line_length = len;
}
len += (w-1)->space; /* w > start >= word */
len += w->length;
} while (len < max_width);
start->best_cost = best + base_cost(start);
}
}
/* constant component of cost of breaking before this word */
COST
base_cost(this)
reg WORD *this;
{
reg COST cost;
cost = LINE_COST;
if (this > word)
if ((this-1)->period)
cost -= SENTENCE_BONUS;
else if ((this-1)->punct)
cost -= PUNCT_BONUS;
else if (this > word+1 && (this-2)->period)
cost += WIDOW_COST/((this-1)->length+2);
if (this->paren)
cost -= PAREN_BONUS;
else if (this->period)
cost += ORPHAN_COST/(this->length+2);
return cost;
}
/* length-dependent component of cost of breaking at next */
COST
line_cost(next, len)
reg WORD *next;
reg int len;
{
reg int n;
reg COST cost;
if (next == word_limit)
return 0;
n = best_width - len;
cost = SHORT_COST(n);
if (next->next_break != word_limit) {
n = len - next->line_length;
cost += RAGGED_COST(n);
}
return cost;
}
void
put_paragraph(finish)
reg WORD *finish; /* must be in the next_break chain from word */
{
reg WORD *w;
out_column = 0;
put_prefix(first_indent);
put_line(word);
for (w = word->next_break; w != finish; w = w->next_break) {
put_prefix(indent);
put_line(w);
}
}
void
put_line(w)
reg WORD *w;
{
reg WORD *endline;
endline = w->next_break-1;
out_column += w->length;
fputs(w->text, stdout);
while (w != endline) {
put_space(w->space);
w++;
fputs(w->text, stdout);
out_column += w->length;
}
putchar('\n');
}
void
put_prefix(space)
int space;
{
out_column = 0;
put_space(prefix_indent);
fputs(prefix, stdout);
out_column += prefix_length;
put_space(space - out_column);
}
void
put_space(space)
int space;
{
reg int space_target, tab_target;
space_target = out_column + space;
if (tabs) {
tab_target = space_target/8*8;
if (out_column+1 < tab_target)
while (out_column < tab_target) {
putchar('\t');
out_column = (out_column/8 + 1)*8;
}
}
while (out_column < space_target) {
putchar(' ');
out_column++;
}
}